Multimapping Analysis

Load packages

First we will load in the packages needed to execute this analysis

library(rmdformats)
library(tidyverse)
library(magrittr)
library(edgeR)
library(ggplot2)
library(viridis)
library(ggfortify)
library(PCAtools)
library(AnnotationHub)
library(ensembldb)
library(pheatmap)
library(cowplot)
library(ggbeeswarm)
library(grid)
library(gridExtra)
library(tibble)
library(EnsDb.Hsapiens.v86)
library(here)

Some methods will be called “hisat2”, this just means that bootstrapping estimates have not been taken into account. Nothing super special going on.

I have also defined several functions here for quality of life purposes

source(here("R/cor_funcs_variance.R"))

Multi mapping analysis

Each method handles multiple mapping reads differently -Salmon and SA will calculate the probability of a multiple mapped read to each of its mapped transcripts -Kallisto does not report multiple mapping pseudoalignments -Hisat2 will search for multiple alignments and only report the best one

Seems like it really depends on what you wanna do which will dictate the method you use

There are options in both Kallisto and Hisat2 that can handle multiple mappings differently I think

Hisat2’s handling may also be the reason why there are way less ambiguous reads in its run

Load the data

# From 01_Annotations
txp_gene_ensdb_lengths <- read_csv(
  here("data/annotations/txp_gene_ensdb_lengths.csv.gz")
  )

gene_txp_anno <- read_csv(here("data/annotations/grch38_103_df.csv.gz"))

transcript_key <- dplyr::select(txp_gene_ensdb_lengths,
                                c("transcript_id" = tx_id,
                                  "transcript_name" = tx_name))

# From 02_dtelist_prep
hisat2_dtelist <- read_rds(here("data/counts/hisat2_dtelist.rds"))
kallisto_dtelist <- read_rds(here("data/counts/kallisto_dtelist.rds"))
salmon_dtelist <- read_rds(here("data/counts/sa_dtelist.rds"))
star_dtelist <- read_rds(here("data/counts/star_dtelist.rds"))

# From 04_principal_component_analysis
hisat2_loadings <- read_csv(
  here("data/pca/loadings/hisat2_loadings.csv")
  ) %>%
  head(200)

kallisto_loadings <- read_csv(
  here("data/pca/loadings/kallisto_loadings.csv")
  ) %>%
  head(200)

salmon_loadings <- read_csv(
  here("data/pca/loadings/salmon_loadings.csv")
  ) %>%
  head(200)

star_loadings <- read_csv(
  here("data/pca/loadings/star_loadings.csv")
  ) %>%
  head(200)

# From figure2_make.Rmd
hisat2_unique <- read_csv(here("data/unique_transcripts/hisat2_unique.csv.gz"))
kallisto_unique <- read_csv(
  here("data/unique_transcripts/kallisto_unique.csv.gz")
  )
salmon_unique <- read_csv(here("data/unique_transcripts/salmon_unique.csv.gz"))
star_unique <- read_csv(here("data/unique_transcripts/star_unique.csv.gz"))

# From 05_dte_and_dtu
## DTE
hisat2_dte_exclusive <- read_csv(
  here("data/differential_expression/hisat2_exclusive_tx.csv")
  ) %>%
  head(200)

kallisto_dte_exclusive <- read_csv(
  here("data/differential_expression/kallisto_exclusive_tx.csv")
  ) %>%
  head(200)

salmon_dte_exclusive <- read_csv(
  here("data/differential_expression/salmon_exclusive_tx.csv")
)

star_dte_exclusive <- read_csv(
  here("data/differential_expression/star_exclusive_tx.csv")
  ) %>%
  head(200)

all_methods_tx_dte <- read_csv(
  here("data/differential_expression/all_methods_tx.csv")
)

## DTU
hisat2_dtu_exclusive <- read_csv(
  here("data/differential_usage/hisat2_exclusive_dtu.csv")
  ) %>%
  dplyr::rename("transcript_id" = Transcript) %>%
  dplyr::arrange(txpFDR) %>%
  head(200)

kallisto_dtu_exclusive <- read_csv(
  here("data/differential_usage/kallisto_exclusive_dtu.csv")
  ) %>%
  dplyr::rename("transcript_id" = Transcript) %>%
  dplyr::arrange(txpFDR) %>%
  head(200)

star_dtu_exclusive <- read_csv(
  here("data/differential_usage/star_exclusive_dtu.csv")
  ) %>%
  dplyr::rename("transcript_id" = Transcript) %>%
  dplyr::arrange(txpFDR) %>%
  head(200)

salmon_dtu_exclusive <- read_csv(
  here("data/differential_usage/salmon_exclusive_dtu.csv")
  ) %>%
  dplyr::rename("transcript_id" = Transcript) %>%
  dplyr::arrange(txpFDR) %>%
  head(200)

all_methods_tx_dtu <- read_csv(
  here("data/differential_usage/all_methods_tx.csv")
  ) %>%
  dplyr::rename("transcript_id" = Transcript) %>%
  dplyr::arrange(txpFDR) %>%
  head(200)

Get Multi-Mapping Data

Quant Files

hisat2_quant_files <- character()
for(i in list.files(here("data/counts/HISAT2/hisat2_quant"))) {
 quant <- paste0(here("data/counts/HISAT2/hisat2_quant/"), i, "/quant.sf")
 hisat2_quant_files <- c(hisat2_quant_files, quant)
}
  
sa_quant_files <- character()
for(i in list.files(here("data/counts/SA/quant_files"))) {
 quant <- paste0(here("data/counts/SA/quant_files/"), i)
 sa_quant_files <- c(sa_quant_files, quant)
}

star_quant_files <- character()
for(i in list.files(here("data/counts/STAR/quant_files"))) {
 quant <- paste0(here("data/counts/STAR/quant_files/"), i)
 star_quant_files <- c(star_quant_files, quant)
}

Ambiguous Read Files

hisat2_ambig_files <- character()
for(i in list.files(here("data/counts/HISAT2/hisat2_quant"))) {
  ambig <- paste0(here("data/counts/HISAT2/hisat2_quant/"), i,
                  "/aux_info/ambig_info.tsv")
  hisat2_ambig_files <- c(hisat2_ambig_files, ambig)
}

sa_ambig_files <- character()
for(i in list.files(here("data/counts/SA/ambig_info"))) {
  ambig <- paste0(here("data/counts/SA/ambig_info/"), i)
  sa_ambig_files <- c(sa_ambig_files, ambig)
}

star_ambig_files <- character()
for(i in list.files(here("data/counts/STAR/ambig_info"))) {
  ambig <- paste0(here("data/counts/STAR/ambig_info/"), i)
  star_ambig_files <- c(star_ambig_files, ambig)
}

Set multi-mapping info function

multi_mapping <- function(quant, ambig) {
  x <- cbind(quant, ambig) %>%
    mutate(sum = TPM + NumReads + UniqueCount + AmbigCount) %>%
    dplyr::filter(sum > 0) %>%
    dplyr::select(-sum) %>%
    mutate(multimap = NumReads-UniqueCount) %>%
    mutate(percent_est_from_multimap = multimap/NumReads,
           percent_est_from_multimap = replace_na(percent_est_from_multimap, 0),
           percent_ambigs_used = multimap/AmbigCount,
           percent_ambigs_used = replace_na(percent_ambigs_used, 0),
           status = case_when(
             UniqueCount == NumReads & UniqueCount > 0 ~ "all_unique_reads",
             UniqueCount == 0 & NumReads == 0 & AmbigCount > 0 ~ "all_ambig_no_counts",
             multimap == NumReads ~ "all_ambig_reads",
             AmbigCount != NumReads & UniqueCount != NumReads & percent_est_from_multimap > 0.5 ~ "more_multi",
             AmbigCount != NumReads & UniqueCount != NumReads & percent_est_from_multimap < 0.5 ~ "more_unique",
             percent_est_from_multimap == 0.5 ~ "equal"
           ),
           status = factor(status, levels = c("all_unique_reads",
                                              "more_unique",
                                              "equal",
                                              "more_multi",
                                              "all_ambig_reads")))
  
  return(x)
}

Get Multi-mapping for each Aligner

HISAT2

hisat2_quants <- list()
for(i in 1:length(hisat2_quant_files)) {
  hisat2_quants[[i]] <- read_delim(
    hisat2_quant_files[[i]]
  )
}
names(hisat2_quants) <- list.files(here("data/counts/HISAT2/hisat2_quant"))

hisat2_ambigs <- list()
for(i in 1:length(hisat2_ambig_files)) {
  hisat2_ambigs[[i]] <- read_tsv(
    hisat2_ambig_files[[i]]
  )
}
names(hisat2_ambigs) <- list.files(here("data/counts/HISAT2/hisat2_quant"))


hisat2_multi_mapping <- list()
for(i in 1:length(hisat2_quants)) {
  hisat2_multi_mapping[[i]] <- multi_mapping(quant = hisat2_quants[[i]], 
                                             ambig = hisat2_ambigs[[i]])
}
names(hisat2_multi_mapping) <- names(hisat2_quants)

Selective Alignment

sa_quants <- list()
for(i in 1:length(sa_quant_files)) {
  sa_quants[[i]] <- read_delim(
    sa_quant_files[[i]]
  )
}
names(sa_quants) <- basename(sa_quant_files) %>%
  str_remove("quant_") %>%
  str_remove(".sf")

sa_ambigs <- list()
for(i in 1:length(sa_ambig_files)) {
  sa_ambigs[[i]] <- read_tsv(
    sa_ambig_files[[i]]
  )
}
names(sa_ambigs) <- basename(sa_ambig_files) %>%
  str_remove("_ambig_info") %>%
  str_remove(".tsv")

sa_multi_mapping <- list()
for(i in 1:length(sa_quants)) {
  sa_multi_mapping[[i]] <- multi_mapping(quant = sa_quants[[i]], 
                                         ambig = sa_ambigs[[i]])
}
names(sa_multi_mapping) <- names(sa_quants)

STAR

star_quants <- list()
for(i in 1:length(star_quant_files)) {
  star_quants[[i]] <- read_delim(
    star_quant_files[[i]]
  )
}
names(star_quants) <- basename(star_quant_files) %>%
  str_remove("quant_") %>%
  str_remove(".sf")

star_ambigs <- list()
for(i in 1:length(star_ambig_files)) {
  star_ambigs[[i]] <- read_tsv(
    star_ambig_files[[i]]
  )
}
names(star_ambigs) <- basename(star_ambig_files) %>%
  str_remove("_ambig_info") %>%
  str_remove(".tsv")

star_multi_mapping <- list()
for(i in 1:length(star_quants)) {
  star_multi_mapping[[i]] <- multi_mapping(quant = star_quants[[i]], 
                                           ambig = star_ambigs[[i]])
}
names(star_multi_mapping) <- names(star_quants)

Determine STAR ambiguous reads after Salmon

mm_rate <- numeric()
for(i in names(star_multi_mapping)) {
  sum_mm <- sum(star_multi_mapping[[i]][["multimap"]])
  sum_un <- sum(star_multi_mapping[[i]][["UniqueCount"]])
  rate <- sum_mm/(sum_mm+sum_un)
  mm_rate <- c(mm_rate, rate)
}
multi_nms <- c("SRR13401116", "SRR13401117", "SRR13401118", "SRR13401119",
               "SRR13401120", "SRR13401121", "SRR13401122", "SRR13401123")
mm_rate_df <- data.frame(multi_nms, mm_rate)
mean(mm_rate_df$mm_rate) # ~77.8%
## [1] 0.7775229

STAR numbers directly from log files

star_multimap <- c(8.14+0.08, 9.25+0.08, 9.98+0.1, 8.19+0.08, 
                   9.6+0.08, 9.04+0.08, 8.06+0.08, 7.94+0.08)
multi_nms <- c("SRR13401116", "SRR13401117", "SRR13401118", "SRR13401119",
               "SRR13401120", "SRR13401121", "SRR13401122", "SRR13401123")
star_multimap_rates <- data.frame(multi_nms, star_multimap) %>%
  set_colnames(c("sample_id", "multimap_rate"))
mean(star_multimap_rates$multimap_rate)
## [1] 8.8575

Create joined data frames

joined_df <- get_joined_samples(w = hisat2_dtelist,
                                x = kallisto_dtelist,
                                y = star_dtelist,
                                z = salmon_dtelist,
                                method_w = "Hisat2",
                                method_x = "Kallisto",
                                method_y = "STAR",
                                method_z = "Salmon",
                                sample = 1)

cor_df <- joined_df
cor_df$variance <- apply(cor_df[,-1], 1, var)
lowcor_ensdb <- cor_df %>%
  dplyr::arrange(desc(variance)) %>%
  head(200) %>%
  dplyr::select(transcript_id) %>%
  dplyr::left_join(gene_txp_anno,
                   by = "transcript_id")

lowcor_ensdb <- lowcor_ensdb %>%
  dplyr::select(-score,
                -phase) %>%
  select_if(~ !all(is.na(.))) %>%
  dplyr::select(-ccds_id,
                -transcript_support_level,
                -tag) %>%
  drop_na()

Create Multi-Mapping Group Lists

highcor_ensdb <- cor_df %>%
  dplyr::arrange(variance) %>%
  head(200) %>%
  dplyr::select(transcript_id) %>%
  dplyr::left_join(gene_txp_anno,
                   by = "transcript_id")

highcor_ensdb <- highcor_ensdb %>%
  dplyr::select(-score,
                -phase) %>%
  select_if(~ !all(is.na(.))) %>%
  dplyr::select(-ccds_id,
                -transcript_support_level,
                -tag) %>%
  drop_na()
group_list <- list()
group_list[["highcor"]] <- highcor_ensdb
group_list[["lowcor"]] <- lowcor_ensdb

group_list[["hisat2"]] <- hisat2_loadings
group_list[["kallisto"]] <- kallisto_loadings
group_list[["star"]] <- star_loadings
group_list[["salmon"]] <- salmon_loadings

group_list[["hisat2_unique"]] <- hisat2_unique
group_list[["kallisto_unique"]] <- kallisto_unique
group_list[["star_unique"]] <- star_unique
group_list[["salmon_unique"]] <- salmon_unique

group_list[["hisat2_dte"]] <- hisat2_dte_exclusive
group_list[["salmon_dte"]] <- salmon_dte_exclusive
group_list[["kallisto_dte"]] <- kallisto_dte_exclusive
group_list[["star_dte"]] <- star_dte_exclusive
#group_list[["all_methods_dte"]] <- all_methods_tx_dte

group_list[["hisat2_dtu"]] <- hisat2_dtu_exclusive
group_list[["kallisto_dtu"]] <- kallisto_dtu_exclusive
group_list[["star_dtu"]] <- star_dtu_exclusive
group_list[["salmon_dtu"]] <- salmon_dtu_exclusive
#group_list[["all_methods_dtu"]] <- all_methods_tx_dtu
multimap_list <- list()
multimap_list[["HISAT2"]] <- hisat2_multi_mapping
multimap_list[["STAR"]] <- star_multi_mapping
multimap_list[["Salmon"]] <- sa_multi_mapping 

Multi mapping plots

Prep for multi-map

group_name <- names(group_list)
method_name <- names(multimap_list)
samp_nms <- names(multimap_list$HISAT2)

multimap_group_list <- list()
for(i in group_name) {
  for(j in method_name) {
    for(k in samp_nms) {
      multimap_group_list[[i]][[j]][[k]] <- multimap_list[[j]][[k]] %>%
        dplyr::mutate(transcript_id = str_remove(Name, "\\..*")) %>%
        dplyr::filter(transcript_id %in% group_list[[i]]$transcript_id) 
    }
  }
}

Percentage of Multi-Mapping per Sample

for(i in group_name) {
  for(j in samp_nms) {
    message("Starting group: ", i," sample: ", j, " plot")
    multi_map_plot <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      ggplot(aes(x = group, y = percent_est_from_multimap)) +
      geom_quasirandom() +
      ggtitle(paste0("Sample: ", j)) +
      labs(x = "",
           y = "Multi-Mapping Reads (%)") +
      theme_bw() +
      theme(axis.title = element_text(colour = "black", size = 14),
            axis.text = element_text(colour = "black", size = 12),
            plot.title = element_text(colour = "black", size = 15, 
                                      face = "bold"))
    
    ggsave(plot = multi_map_plot,
           filename = paste0(j, "_percent_mm.png"),
           path = here(paste0("figures/multi_map/", i, "/percent_mm/")),
           height = 150,
           width = 175,
           dpi = 400,
           units = "mm", create.dir = TRUE)
  }
}

Number of Reads by Method

for(i in group_name) {
  for(j in samp_nms) {
    message("Starting group: ", i," sample: ", j, " plot")
    num_reads_plot <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      ggplot(aes(x = group, y = NumReads)) +
      geom_quasirandom() +
      ggtitle(paste0("Sample: ", i)) +
      labs(x = "",
           y = "Number of Reads") +
      theme_bw() +
      theme(axis.title = element_text(colour = "black", size = 14),
            axis.text = element_text(colour = "black", size = 12),
            plot.title = element_text(colour = "black", size = 15, 
                                      face = "bold"))
    
    ggsave(plot = num_reads_plot,
           filename = paste0(j, "_numreads.png"),
           path = here(paste0("figures/multi_map/", i, "/number_of_reads/")),
           height = 150,
           width = 175,
           dpi = 400,
           units = "mm", create.dir = TRUE)
  }
}

Counts by Method

for(i in group_name) {
  for(j in samp_nms) {
    message("Starting group: ", i," sample: ", j, " plot")
    
    ambig_counts_plot <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      dplyr::select(group, UniqueCount, AmbigCount) %>%
      pivot_longer(cols = c("UniqueCount", "AmbigCount"),
                   names_to = "type",
                   values_to = "counts") %>%
      dplyr::mutate(type = case_when(
          type == "AmbigCount" ~ "Multi-Mapped",
          type == "UniqueCount" ~ "Uniquely Mapped"),
        type = factor(type, levels = c("Uniquely Mapped", "Multi-Mapped"))) %>%
      dplyr::mutate(counts = counts + 1) %>%
      ggplot(aes(x = type, y = log2(counts))) +
      geom_quasirandom(colour = "darkorange") +
      geom_boxplot(fill = "blue", colour = "black", width = 0.2,
                   outlier.shape = NA) +
      #scale_colour_manual(values = c("darkorange", "darkblue")) +
      ggtitle(paste0("Sample: ", j)) +
      labs(y = "Number of Reads (log2)",
           x = "",
           colour = "") +
      facet_wrap(~ group) +
      theme_bw() +
      theme(axis.title = element_text(colour = "black", size = 14),
            axis.text = element_text(colour = "black", size = 12),
            strip.background = element_rect(fill = "lightblue"),
            strip.text = element_text(colour = "black", size = 14))
    
    ggsave(plot = ambig_counts_plot,
           filename = paste0(j, "_mm_counts.png"),
           path = here(paste0("figures/multi_map/", i, "/counts_mm/")),
           height = 200,
           width = 300,
           dpi = 400,
           units = "mm", create.dir = TRUE)
  }
}

Ambig Ratios by Method

for(i in group_name) {
  for(j in samp_nms) {
    message("Starting group: ", i," sample: ", j, " plot")
    
    ambig_counts_plot <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      dplyr::select(group, UniqueCount, AmbigCount) %>%
      dplyr::mutate(ratio = log2((UniqueCount+1)/(AmbigCount+1)),
                    category = case_when(
                      ratio > 0 ~ "More Unique Mappings",
                      .default = "More Multiple Mappings"
                    ),
                    category = factor(category,
                                      levels = c("More Unique Mappings",
                                                 "More Multiple Mappings")
                    )) %>%
      pivot_longer(cols = "ratio",
                   names_to = "type",
                   values_to = "counts") %>%
      ggplot(aes(x = group, y = counts, colour = category)) +
      geom_quasirandom() +
      geom_boxplot(fill = "lightblue", colour = "black", width = 0.2,
                   outlier.shape = NA) +
      scale_y_continuous(limits = c(-20, 20)) +
      scale_colour_manual(values = c("darkblue", "darkorange")) +
      ggtitle(paste0("Sample: ", j)) +
      labs(y = "Log2 Ratio (Unique mappings/Multiple mappings)",
           x = "",
           colour = "") +
      #facet_wrap(~ group) +
      theme_bw() +
      theme(axis.title = element_text(colour = "black", size = 12),
            axis.text = element_text(colour = "black", size = 12),
            strip.background = element_rect(fill = "lightblue"),
            strip.text = element_text(colour = "black", size = 14))
    
    ggsave(plot = ambig_counts_plot,
           filename = paste0(j, "_mm_ratios.png"),
           path = here(paste0("figures/multi_map/", i, "/ratios_mm/")),
           height = 200,
           width = 300,
           dpi = 400,
           units = "mm", create.dir = TRUE)
  }
}

for(j in samp_nms) {
  ratio_df <- data.frame()
  for(i in group_name[3:6]) {
    message("Starting group: ", i," sample: ", j, " plot")
    
    ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      dplyr::mutate(group = factor(group,
                                   levels = c("HISAT2", "Salmon", "STAR"))) %>%
      dplyr::select(group, UniqueCount, AmbigCount) %>%
      dplyr::mutate(ratio = log2((UniqueCount+1)/(AmbigCount+1)),
                    category = case_when(
                      ratio > 0 ~ "More Unique\nMappings",
                      .default = "More Multiple\nMappings"
                    ),
                    category = factor(category,
                                      levels = c("More Unique\nMappings",
                                                 "More Multiple\nMappings")
                    )) %>%
      pivot_longer(cols = "ratio",
                   names_to = "type",
                   values_to = "counts") %>%
      dplyr::mutate(tx_group = case_when(
        i == "highcor" ~ "Concordant Transcripts",
        i == "lowcor" ~ "Discordant Transcripts",
        i == "hisat2" ~ "Top Negative\nPC1 Loadings",
        i == "kallisto" ~ "Top Negative\nPC2 Loadings",
        i == "star" ~ "Top Positive\nPC2 Loadings",
        i == "salmon" ~ "Top Negative\nPC5 Loadings"
      ))
    ratio_df <- rbind(ratio_df, ratio_mm)
  }
  ratio_plot <- ratio_df %>%
    ggplot(aes(x = group, y = counts, colour = category)) +
    geom_quasirandom() +
    geom_boxplot(fill = "yellow", colour = "black", width = 0.2,
                 outlier.shape = NA) +
    scale_y_continuous(limits = c(-20, 20)) +
    scale_colour_manual(values = c("darkblue", "darkorange")) +
    ggtitle(paste0("Sample: ", j)) +
    labs(y = "Log2 Ratio (Unique mappings/Multiple mappings)",
         x = "",
         colour = "Category") +
    guides(colour = guide_legend(byrow = TRUE)) +
    facet_wrap(~ tx_group, nrow = 1, ncol = 4) +
    theme_bw() +
    theme(plot.title = element_text(colour = "black", size = 18),
          axis.title = element_text(colour = "black", size = 18),
          axis.text = element_text(colour = "black", size = 16),
          strip.background = element_rect(fill = "lightblue"),
          strip.text = element_text(colour = "black", size = 18),
          legend.text = element_text(colour = "black", size = 16),
          legend.title = element_text(colour = "black", size = 18),
          legend.spacing.y = unit(0.5, "cm"))
  
  ggsave(plot = ratio_plot,
         filename = paste0(j, "_mm_ratios.png"),
         path = here(paste0("figures/multi_map/all_groups_ratio/")),
         height = 280,
         width = 420,
         dpi = 400,
         units = "mm", create.dir = TRUE)
}
for(j in samp_nms) {
  # First do concordant and discordant
  cd_ratio_df <- data.frame()
  for(i in group_name[1:2]) {
    message("Starting group: ", i," sample: ", j, " plot")
    
    ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      dplyr::mutate(group = factor(group,
                                   levels = c("HISAT2", "Salmon", "STAR"))) %>%
      dplyr::select(group, UniqueCount, AmbigCount) %>%
      dplyr::mutate(ratio = log2((UniqueCount+1)/(AmbigCount+1)),
                    category = case_when(
                      ratio > 0 ~ "More Unique\nMappings",
                      .default = "More Multiple\nMappings"
                    ),
                    category = factor(category,
                                      levels = c("More Unique\nMappings",
                                                 "More Multiple\nMappings")
                    )) %>%
      pivot_longer(cols = "ratio",
                   names_to = "type",
                   values_to = "counts") %>%
      dplyr::mutate(tx_group = case_when(
        i == "highcor" ~ "Concordant Transcripts",
        i == "lowcor" ~ "Discordant Transcripts",
        i == "hisat2" ~ "Top Negative\nPC1 Loadings",
        i == "kallisto" ~ "Top Negative\nPC2 Loadings",
        i == "star" ~ "Top Positive\nPC2 Loadings",
        i == "salmon" ~ "Top Negative\nPC5 Loadings"
      ))
    cd_ratio_df <- rbind(cd_ratio_df, ratio_mm)
  }
  cd_ratio_plot <- cd_ratio_df %>%
    ggplot(aes(x = group, y = counts, colour = category)) +
    geom_quasirandom() +
    geom_boxplot(fill = "yellow", colour = "black", width = 0.2,
                 outlier.shape = NA) +
    scale_y_continuous(limits = c(-20, 20)) +
    scale_colour_manual(values = c("darkblue", "darkorange")) +
    ggtitle(paste0("Sample: ", j)) +
    labs(colour = "Category") +
    guides(colour = guide_legend(byrow = TRUE)) +
    facet_wrap(~ tx_group, nrow = 1, ncol = 2) +
    theme_bw() +
    theme(plot.title = element_text(colour = "black", size = 18),
          axis.title = element_blank(),
          axis.text = element_text(colour = "black", size = 16),
          strip.background = element_rect(fill = "lightblue"),
          strip.text = element_text(colour = "black", size = 18),
          legend.position = "none",
          legend.text = element_text(colour = "black", size = 16),
          legend.title = element_text(colour = "black", size = 18),
          legend.spacing.y = unit(0.5, "cm"))
  
  # Then do PC loadings
  pc_ratio_df <- data.frame()
  for(i in group_name[3:6]) {
    message("Starting group: ", i," sample: ", j, " plot")
    
    ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      dplyr::mutate(group = factor(group,
                                   levels = c("HISAT2", "Salmon", "STAR"))) %>%
      dplyr::select(group, UniqueCount, AmbigCount) %>%
      dplyr::mutate(ratio = log2((UniqueCount+1)/(AmbigCount+1)),
                    category = case_when(
                      ratio > 0 ~ "More Unique\nMappings",
                      .default = "More Multiple\nMappings"
                    ),
                    category = factor(category,
                                      levels = c("More Unique\nMappings",
                                                 "More Multiple\nMappings")
                    )) %>%
      pivot_longer(cols = "ratio",
                   names_to = "type",
                   values_to = "counts") %>%
      dplyr::mutate(tx_group = case_when(
        i == "highcor" ~ "Concordant Transcripts",
        i == "lowcor" ~ "Discordant Transcripts",
        i == "hisat2" ~ "Top Negative\nPC1 Loadings",
        i == "kallisto" ~ "Top Negative\nPC2 Loadings",
        i == "star" ~ "Top Positive\nPC2 Loadings",
        i == "salmon" ~ "Top Negative\nPC5 Loadings"
      ))
    pc_ratio_df <- rbind(pc_ratio_df, ratio_mm)
  }
  
  pc_ratio_plot <- pc_ratio_df %>%
    ggplot(aes(x = group, y = counts, colour = category)) +
    geom_quasirandom() +
    geom_boxplot(fill = "yellow", colour = "black", width = 0.2,
                 outlier.shape = NA) +
    scale_y_continuous(limits = c(-20, 20)) +
    scale_colour_manual(values = c("darkblue", "darkorange")) +
    #ggtitle(paste0("Sample: ", j)) +
    labs(colour = "Category") +
    guides(colour = guide_legend(byrow = TRUE)) +
    facet_wrap(~ tx_group, nrow = 1, ncol = 4) +
    theme_bw() +
    theme(plot.title = element_text(colour = "black", size = 18),
          axis.title = element_blank(),
          axis.text = element_text(colour = "black", size = 16),
          strip.background = element_rect(fill = "lightblue"),
          strip.text = element_text(colour = "black", size = 18),
          legend.text = element_text(colour = "black", size = 16),
          legend.title = element_text(colour = "black", size = 18),
          legend.spacing.y = unit(0.5, "cm"))
  
  fig_leg <- cowplot::get_legend(pc_ratio_plot)
  
  ylab <- textGrob("Log2 Ratio (Unique mappings/Multiple mappings)",
                 gp = gpar(fontface = "bold",
                           col = "black",
                           fontsize = 20),
                 rot = 90)

  plot_joined <- cowplot::plot_grid(
    cowplot::plot_grid(cd_ratio_plot,
                       pc_ratio_plot + theme(legend.position = "none"),
                       ncol = 1,
                       labels = c("A)", "B)")),
    fig_leg,
    nrow = 1,
    rel_widths = c(1, 0.15))
  
  fig_5 <- grid.arrange(arrangeGrob(plot_joined, left = ylab))
  
  ggsave(plot = fig_5,
         filename = paste0(j, "_ratios_ambig_fig5.png"),
         path = here(paste0("figures/chapter_figures/figure5_ratios")),
         height = 340,
         width = 420,
         dpi = 400,
         units = "mm", create.dir = TRUE)
}

Multi Ratios by Method

for(i in group_name) {
  for(j in samp_nms) {
    message("Starting group: ", i," sample: ", j, " plot")
    
    ambig_counts_plot <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      dplyr::select(group, UniqueCount, multimap) %>%
      dplyr::mutate(ratio = log2((UniqueCount+1)/(multimap+1)),
                    category = case_when(
                      ratio > 0 ~ "More Unique Mappings",
                      .default = "More Multiple Mappings"
                    ),
                    category = factor(category,
                                      levels = c("More Unique Mappings",
                                                 "More Multiple Mappings")
                    )) %>%
      pivot_longer(cols = "ratio",
                   names_to = "type",
                   values_to = "counts") %>%
      ggplot(aes(x = group, y = counts, colour = category)) +
      geom_quasirandom() +
      geom_boxplot(fill = "lightblue", colour = "black", width = 0.2,
                   outlier.shape = NA) +
      scale_y_continuous(limits = c(-20, 20)) +
      scale_colour_manual(values = c("darkblue", "darkorange")) +
      ggtitle(paste0("Sample: ", j)) +
      labs(y = "Log2 Ratio (Unique mappings/Multiple mappings)",
           x = "",
           colour = "") +
      #facet_wrap(~ group) +
      theme_bw() +
      theme(axis.title = element_text(colour = "black", size = 12),
            axis.text = element_text(colour = "black", size = 12),
            strip.background = element_rect(fill = "lightblue"),
            strip.text = element_text(colour = "black", size = 14))
    
    ggsave(plot = ambig_counts_plot,
           filename = paste0(j, "_mm_ratios_multi.png"),
           path = here(paste0("figures/multi_map/", i, "/ratios_mm/")),
           height = 150,
           width = 175,
           dpi = 400,
           units = "mm", create.dir = TRUE)
  }
}

for(j in samp_nms) {
  ratio_df <- data.frame()
  for(i in group_name[3:6]) {
    message("Starting group: ", i," sample: ", j, " plot")
    
    ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      dplyr::mutate(group = factor(group,
                                   levels = c("HISAT2", "Salmon", "STAR"))) %>%
      dplyr::select(group, UniqueCount, multimap) %>%
      dplyr::mutate(ratio = log2((UniqueCount+1)/(multimap+1)),
                    category = case_when(
                      ratio > 0 ~ "More Unique\nMappings",
                      .default = "More Multiple\nMappings"
                    ),
                    category = factor(category,
                                      levels = c("More Unique\nMappings",
                                                 "More Multiple\nMappings")
                    )) %>%
      pivot_longer(cols = "ratio",
                   names_to = "type",
                   values_to = "counts") %>%
      dplyr::mutate(tx_group = case_when(
        i == "highcor" ~ "Concordant Transcripts",
        i == "lowcor" ~ "Discordant Transcripts",
        i == "hisat2" ~ "Top Negative\nPC1 Loadings",
        i == "kallisto" ~ "Top Negative\nPC2 Loadings",
        i == "star" ~ "Top Positive\nPC2 Loadings",
        i == "salmon" ~ "Top Negative\nPC5 Loadings"
      ),
      tx_group = factor(tx_group,
                        levels = c("Concordant Transcripts",
                                   "Discordant Transcripts",
                                   "Top Negative\nPC1 Loadings",
                                   "Top Positive\nPC2 Loadings",
                                   "Top Negative\nPC2 Loadings",
                                   "Top Negative\nPC5 Loadings"))) %>%
      dplyr::filter(!counts == 0)
    ratio_df <- rbind(ratio_df, ratio_mm)
  }
  ratio_plot <- ratio_df %>%
    ggplot(aes(x = group, y = counts, colour = category)) +
    geom_quasirandom() +
    geom_boxplot(fill = "yellow", colour = "black", width = 0.2,
                 outlier.shape = NA) +
    scale_y_continuous(limits = c(-20, 20)) +
    scale_colour_manual(values = c("darkblue", "darkorange")) +
    ggtitle(paste0("Sample: ", j)) +
    labs(y = "Log2 Ratio (Unique mappings/Multiple mappings)",
         x = "",
         colour = "Category") +
    guides(colour = guide_legend(byrow = TRUE)) +
    facet_wrap(~ tx_group, nrow = 1, ncol = 4) +
    theme_bw() +
    theme(plot.title = element_text(colour = "black", size = 18),
          axis.title = element_text(colour = "black", size = 18),
          axis.text = element_text(colour = "black", size = 16),
          strip.background = element_rect(fill = "lightblue"),
          strip.text = element_text(colour = "black", size = 18),
          legend.text = element_text(colour = "black", size = 16),
          legend.title = element_text(colour = "black", size = 18),
          legend.spacing.y = unit(0.5, "cm"))
  
  ggsave(plot = ratio_plot,
         filename = paste0(j, "_mm_multi_ratios.png"),
         path = here(paste0("figures/multi_map/all_groups_ratio/")),
         height = 280,
         width = 420,
         dpi = 400,
         units = "mm", create.dir = TRUE)
}
for(j in samp_nms) {
  # First do concordant and discordant
  cd_ratio_df <- data.frame()
  for(i in group_name[1:2]) {
    message("Starting group: ", i," sample: ", j, " plot")
    
    ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      dplyr::mutate(group = factor(group,
                                   levels = c("HISAT2", "Salmon", "STAR"))) %>%
      dplyr::select(group, UniqueCount, multimap) %>%
      dplyr::mutate(ratio = log2((UniqueCount+1)/(multimap+1)),
                    category = case_when(
                      ratio > 0 ~ "More Unique\nMappings",
                      .default = "More Multiple\nMappings"
                    ),
                    category = factor(category,
                                      levels = c("More Unique\nMappings",
                                                 "More Multiple\nMappings")
                    )) %>%
      pivot_longer(cols = "ratio",
                   names_to = "type",
                   values_to = "counts") %>%
      dplyr::mutate(tx_group = case_when(
        i == "highcor" ~ "Concordant Transcripts",
        i == "lowcor" ~ "Discordant Transcripts",
        i == "hisat2" ~ "Top Negative\nPC1 Loadings",
        i == "kallisto" ~ "Top Negative\nPC2 Loadings",
        i == "star" ~ "Top Positive\nPC2 Loadings",
        i == "salmon" ~ "Top Negative\nPC5 Loadings"
      ),
      tx_group = factor(tx_group,
                        levels = c("Concordant Transcripts",
                                   "Discordant Transcripts",
                                   "Top Negative\nPC1 Loadings",
                                   "Top Positive\nPC2 Loadings",
                                   "Top Negative\nPC2 Loadings",
                                   "Top Negative\nPC5 Loadings"))) %>%
      dplyr::filter(!counts == 0)
    cd_ratio_df <- rbind(cd_ratio_df, ratio_mm)
  }
  cd_ratio_plot <- cd_ratio_df %>%
    ggplot(aes(x = group, y = counts, colour = category)) +
    geom_quasirandom() +
    geom_boxplot(fill = "yellow", colour = "black", width = 0.2,
                 outlier.shape = NA) +
    scale_y_continuous(limits = c(-20, 20)) +
    scale_colour_manual(values = c("darkblue", "darkorange")) +
    ggtitle(paste0("Sample: ", j)) +
    labs(colour = "Category") +
    guides(colour = guide_legend(byrow = TRUE)) +
    facet_wrap(~ tx_group, nrow = 1, ncol = 2) +
    theme_bw() +
    theme(plot.title = element_text(colour = "black", size = 18),
          axis.title = element_blank(),
          axis.text = element_text(colour = "black", size = 16),
          strip.background = element_rect(fill = "lightblue"),
          strip.text = element_text(colour = "black", size = 18),
          legend.position = "none",
          legend.text = element_text(colour = "black", size = 16),
          legend.title = element_text(colour = "black", size = 18),
          legend.spacing.y = unit(0.5, "cm"))
  
  # Then do PC loadings
  pc_ratio_df <- data.frame()
  for(i in group_name[3:6]) {
    message("Starting group: ", i," sample: ", j, " plot")
    
    ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      dplyr::mutate(group = factor(group,
                                   levels = c("HISAT2", "Salmon", "STAR"))) %>%
      dplyr::select(group, UniqueCount, multimap) %>%
      dplyr::mutate(ratio = log2((UniqueCount+1)/(multimap+1)),
                    category = case_when(
                      ratio > 0 ~ "More Unique\nMappings",
                      .default = "More Multiple\nMappings"
                    ),
                    category = factor(category,
                                      levels = c("More Unique\nMappings",
                                                 "More Multiple\nMappings")
                    )) %>%
      pivot_longer(cols = "ratio",
                   names_to = "type",
                   values_to = "counts") %>%
      dplyr::mutate(tx_group = case_when(
        i == "highcor" ~ "Concordant Transcripts",
        i == "lowcor" ~ "Discordant Transcripts",
        i == "hisat2" ~ "Top Negative\nPC1 Loadings",
        i == "kallisto" ~ "Top Negative\nPC2 Loadings",
        i == "star" ~ "Top Positive\nPC2 Loadings",
        i == "salmon" ~ "Top Negative\nPC5 Loadings"
      ),
      tx_group = factor(tx_group, levels = c("Concordant Transcripts",
                                             "Discordant Transcripts",
                                             "Top Negative\nPC1 Loadings",
                                             "Top Positive\nPC2 Loadings",
                                             "Top Negative\nPC2 Loadings",
                                             "Top Negative\nPC5 Loadings")))
    pc_ratio_df <- rbind(pc_ratio_df, ratio_mm)
  }
  
  pc_ratio_plot <- pc_ratio_df %>%
    ggplot(aes(x = group, y = counts, colour = category)) +
    geom_quasirandom() +
    geom_boxplot(fill = "yellow", colour = "black", width = 0.2,
                 outlier.shape = NA) +
    scale_y_continuous(limits = c(-20, 20)) +
    scale_colour_manual(values = c("darkblue", "darkorange")) +
    #ggtitle(paste0("Sample: ", j)) +
    labs(colour = "Category") +
    guides(colour = guide_legend(byrow = TRUE)) +
    facet_wrap(~ tx_group, nrow = 1, ncol = 4) +
    theme_bw() +
    theme(plot.title = element_text(colour = "black", size = 18),
          axis.title = element_blank(),
          axis.text = element_text(colour = "black", size = 16),
          strip.background = element_rect(fill = "lightblue"),
          strip.text = element_text(colour = "black", size = 18),
          legend.text = element_text(colour = "black", size = 16),
          legend.title = element_text(colour = "black", size = 18),
          legend.spacing.y = unit(0.5, "cm"))
  
  fig_leg <- cowplot::get_legend(pc_ratio_plot)
  
  ylab <- textGrob("Log2 Ratio (Unique mappings/Multiple mappings)",
                 gp = gpar(fontface = "bold",
                           col = "black",
                           fontsize = 20),
                 rot = 90)

  plot_joined <- cowplot::plot_grid(
    cowplot::plot_grid(cd_ratio_plot,
                       pc_ratio_plot + theme(legend.position = "none"),
                       ncol = 1,
                       labels = c("A)", "B)")),
    fig_leg,
    nrow = 1,
    rel_widths = c(1, 0.15))
  
  fig_5 <- grid.arrange(arrangeGrob(plot_joined, left = ylab))
  
  ggsave(plot = fig_5,
         filename = paste0(j, "_ratios_multi_fig5.png"),
         path = here(paste0("figures/chapter_figures/figure5_ratios")),
         height = 340,
         width = 420,
         dpi = 400,
         units = "mm", create.dir = TRUE)
}

Figure 5 Ratios

ratio_df_fig <- data.frame()
cor_groups <- c("highcor", "lowcor")
for(i in cor_groups) {
  for(j in samp_nms) {
    ratio_mm <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      dplyr::mutate(group = factor(group,
                                   levels = c("HISAT2", "Salmon", "STAR"))) %>%
      dplyr::select(group, UniqueCount, multimap) %>%
      dplyr::mutate(ratio = log2((UniqueCount+1)/(multimap+1)),
                    category = case_when(
                      ratio > 0 ~ "More Unique\nMappings",
                      .default = "More Multiple\nMappings"
                    ),
                    category = factor(category,
                                      levels = c("More Unique\nMappings",
                                                 "More Multiple\nMappings")
                    )) %>%
      pivot_longer(cols = "ratio",
                   names_to = "type",
                   values_to = "counts") %>%
      dplyr::mutate(sample = j,
                    tx_group = i)
    
    ratio_df_fig <- rbind(ratio_df_fig, ratio_mm)
  }
}

figure5_mm <- ratio_df_fig %>%
  dplyr::mutate(tx_group = case_when(
    tx_group == "highcor" ~ "Concordant Transcripts",
    tx_group == "lowcor" ~ "Discordant Transcripts")) %>%
  ggplot(aes(x = group, y = counts, colour = category)) +
  geom_quasirandom(size = 1) +
  scale_y_continuous(limits = c(-20, 20)) +
  scale_colour_manual(values = c("darkblue", "darkorange")) +
  #ggtitle(paste0("Sample: ", j)) +
  labs(colour = "Category") +
  guides(colour = guide_legend(byrow = TRUE)) +
  facet_wrap(~ tx_group, nrow = 1, ncol = 4) +
  theme_bw() +
  theme(plot.title = element_text(colour = "black", size = 18),
        axis.title = element_blank(),
        axis.text = element_text(colour = "black", size = 16),
        strip.background = element_rect(fill = "lightblue"),
        strip.text = element_text(colour = "black", size = 18),
        legend.text = element_text(colour = "black", size = 16),
        legend.title = element_text(colour = "black", size = 18),
        legend.spacing.y = unit(0.5, "cm"))

ggsave(plot = figure5_mm,
       filename = "figure5_multimapping.png",
       path = here("figures/chapter_figures"),
       height = 225,
       width = 300,
       dpi = 400,
       units = "mm", create.dir = TRUE)

Fig 5 Ratio investigate

hisat2_con_ratios <- cd_ratio_df %>%
  dplyr::filter(tx_group == "Concordant Transcripts", group == "HISAT2") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                  .default = times_higher))

summary(hisat2_con_ratios$times_higher)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -1586.075    -4.347     1.877   -18.064    11.276   144.881
salmon_con_ratios <- cd_ratio_df %>%
  dplyr::filter(tx_group == "Concordant Transcripts", group == "Salmon") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(salmon_con_ratios$times_higher)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -4296.930    -7.558     1.486   -99.190     8.400   167.975
star_con_ratios <- cd_ratio_df %>%
  dplyr::filter(tx_group == "Concordant Transcripts", group == "STAR") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(star_con_ratios$times_higher)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -4047.655    -4.370     1.882   396.379    11.644 10614.000
concordant_ratios <- rbind(hisat2_con_ratios,
                           salmon_con_ratios,
                           star_con_ratios)
hisat2_dis_ratios <- cd_ratio_df %>%
  dplyr::filter(tx_group == "Discordant Transcripts", group == "HISAT2") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(hisat2_dis_ratios$times_higher)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -5021.96  -232.51   -88.19  -187.67   -32.87   130.19
salmon_dis_ratios <- cd_ratio_df %>%
  dplyr::filter(tx_group == "Discordant Transcripts", group == "Salmon") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(salmon_dis_ratios$times_higher)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -9411.03  -480.89  -259.51  -447.38   -66.95   413.15
star_dis_ratios <- cd_ratio_df %>%
  dplyr::filter(tx_group == "Discordant Transcripts", group == "STAR") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(star_dis_ratios$times_higher)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -12259.32   -364.95   -161.24   -322.69    -35.62   1389.00
discordant_ratios <- rbind(hisat2_dis_ratios,
                           salmon_dis_ratios,
                           star_dis_ratios)
hisat2_pc1_neg_ratios <- pc_ratio_df %>%
  dplyr::filter(tx_group == "Top Negative\nPC1 Loadings", group == "HISAT2") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(hisat2_pc1_neg_ratios$times_higher)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -5021.961  -138.875   -24.714  -139.224    -3.122     9.406
salmon_pc1_neg_ratios <- pc_ratio_df %>%
  dplyr::filter(tx_group == "Top Negative\nPC1 Loadings", group == "Salmon") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(salmon_pc1_neg_ratios$times_higher)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -4250.183  -121.696   -20.121  -157.552    -3.811     5.053
star_pc1_neg_ratios <- pc_ratio_df %>%
  dplyr::filter(tx_group == "Top Negative\nPC1 Loadings", group == "STAR") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(star_pc1_neg_ratios$times_higher)
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -12259.315    -30.072    -11.430   -147.140     -2.443     10.811
pc1_neg_ratios <- rbind(hisat2_pc1_neg_ratios, 
                        salmon_pc1_neg_ratios,
                        star_pc1_neg_ratios)
hisat2_pc2_pos_ratios <- pc_ratio_df %>%
  dplyr::filter(tx_group == "Top Positive\nPC2 Loadings", group == "HISAT2") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(hisat2_pc2_pos_ratios$times_higher)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -5021.96   -80.56   -21.80  -145.65    -6.10    90.43
salmon_pc2_pos_ratios <- pc_ratio_df %>%
  dplyr::filter(tx_group == "Top Positive\nPC2 Loadings", group == "Salmon") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(salmon_pc2_pos_ratios$times_higher)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -4497.835  -374.457   -86.317  -377.990    -8.501     9.471
star_pc2_pos_ratios <- pc_ratio_df %>%
  dplyr::filter(tx_group == "Top Positive\nPC2 Loadings", group == "STAR") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(star_pc2_pos_ratios$times_higher)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -42026.59   -356.28    -19.24   -633.26     -3.51   2334.00
pc2_pos_ratios <- rbind(hisat2_pc2_pos_ratios,
                        salmon_pc2_pos_ratios,
                        star_pc2_pos_ratios)
hisat2_pc2_neg_ratios <- pc_ratio_df %>%
  dplyr::filter(tx_group == "Top Negative\nPC2 Loadings", group == "HISAT2") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(hisat2_pc2_neg_ratios$times_higher)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1419.73  -226.90   -71.67  -187.14   -27.06   130.19
salmon_pc2_neg_ratios <- pc_ratio_df %>%
  dplyr::filter(tx_group == "Top Negative\nPC2 Loadings", group == "Salmon") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(salmon_pc2_neg_ratios$times_higher)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -19947.10   -580.14   -260.27   -633.40    -70.01    413.15
star_pc2_neg_ratios <- pc_ratio_df %>%
  dplyr::filter(tx_group == "Top Negative\nPC2 Loadings", group == "STAR") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(star_pc2_neg_ratios$times_higher)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -1226.747  -258.247   -69.892  -172.587    -7.547   528.000
pc2_neg_ratios <- rbind(hisat2_pc2_neg_ratios,
                        salmon_pc2_neg_ratios,
                        star_pc2_neg_ratios)
hisat2_pc5_neg_ratios <- pc_ratio_df %>%
  dplyr::filter(tx_group == "Top Negative\nPC5 Loadings", group == "HISAT2") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(hisat2_pc5_neg_ratios$times_higher)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1383.40  -232.06   -97.52  -170.73   -41.25    90.43
salmon_pc5_neg_ratios <- pc_ratio_df %>%
  dplyr::filter(tx_group == "Top Negative\nPC5 Loadings", group == "Salmon") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(salmon_pc5_neg_ratios$times_higher)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3524.15  -503.76  -287.87  -405.40  -127.23    88.05
star_pc5_neg_ratios <- pc_ratio_df %>%
  dplyr::filter(tx_group == "Top Negative\nPC5 Loadings", group == "STAR") %>%
  mutate(times_higher = 2^counts,
         times_higher = case_when(times_higher < 1 ~ ((1/times_higher)*-1),
                                .default = times_higher))

summary(star_pc5_neg_ratios$times_higher)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1986.26  -355.03  -170.33  -258.90   -63.08   611.00
pc5_neg_ratios <- rbind(hisat2_pc5_neg_ratios,
                        salmon_pc5_neg_ratios,
                        star_pc5_neg_ratios)

Categories

Start comparing multimapping rates between methods

for(k in group_name) {
  message("Starting group ", k)
  for(j in samp_nms) {
    message("Starting Sample: ", j)
    mm_df <- data.frame()
    
    for(i in method_name) {
      message("Starting Method ", i)
      statuses_df <- multimap_group_list[[k]][[i]][[j]] %>%
        dplyr::filter(!is.na(status)) %>%
        dplyr::mutate(status = str_to_title(
          str_replace_all(string = status,
                          pattern = "_", 
                          replacement = " "),),
          status = str_replace(status, "Multi", "Ambig"),
          status = factor(status, levels = c("All Unique Reads",
                                             "More Unique",
                                             "Equal",
                                             "More Ambig",
                                             "All Ambig Reads"))) %>%
        dplyr::mutate(method = i)
      
      mm_df <- rbind(mm_df, statuses_df)
    }
    
    statuses_plot <- mm_df %>%
      ggplot(aes(x = status, fill = status)) +
      geom_bar(colour = "black") +
      labs(y = "Frequency",
           x = "",
           fill = "Degree of\nMulti-Mapping") +
      ggtitle(paste0("Sample: ", j)) +
      scale_fill_viridis_d() +
      facet_grid(~ method) +
      theme_bw() +
      theme(
        axis.title = element_text(colour = "black", size = 14),
        axis.text.y = element_text(colour = "black", size = 12),
        plot.title = element_text(colour = "black", size = 14, face = "bold"),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        strip.background = element_rect(fill = "lightblue"),
        strip.text = element_text(colour = "black", size = 14, face = "bold"),
        legend.title = element_text(colour = "black", 
                                    size = 12, face = "bold")
        )
    
    tryCatch({
      ggsave(plot = statuses_plot,
             filename = paste0(j, "_mapping_categories.png"),
             path = here(paste0("figures/multi_map/", k, "/categories/")),
             height = 150,
             width = 175,
             dpi = 400,
             units = "mm", create.dir = TRUE)
    }, error = function(e) {
      cat("No data found for iteration", k, ":", j)
    })
  }
}
## No data found for iteration kallisto_unique : SRR13401117
## No data found for iteration kallisto_unique : SRR13401118
## No data found for iteration kallisto_unique : SRR13401119
## No data found for iteration kallisto_unique : SRR13401120
## No data found for iteration kallisto_unique : SRR13401121
## No data found for iteration kallisto_unique : SRR13401122

Percentage ambig

for(k in group_name) {
  for(i in method_name) {
    for(j in samp_nms) {
      message("Starting Group ", k, ", Method ", i, ", Sample: ", j, " plot")
      
      percent_numreads <- multimap_group_list[[k]][[i]][[j]] %>%
        ggplot(aes(x = NumReads, y = 100*percent_est_from_multimap)) +
        geom_point(size = 3) +
        ggtitle(paste0("Method: ", i, ", Sample: ", j)) +
        labs(x = "Number of Reads",
             y = "Reads that Multi-Mapped (%)") +
        theme_bw() +
        theme(
          axis.title = element_text(colour = "black", size = 14),
          axis.text = element_text(colour = "black", size = 12),
          plot.title = element_text(colour = "black", size = 14,
                                    face = "bold")
          )
      
      tryCatch({
        ggsave(plot = percent_numreads,
               filename = paste0(j, "_percent_numreads.png"),
               path = paste0(here("figures/multi_map/",
                                  k,
                                  "/percent_numreads/"),
                             tolower(i), "/"),
               height = 150,
               width = 175,
               dpi = 400,
               units = "mm", create.dir = TRUE)
      }, error = function(e) {
        cat("No data found for iteration", k, ":", j)
      })
    }
  }
}

for(i in group_name) {
  for(j in samp_nms) {
    message("Starting Group ", i, ", Sample: ", j, " plot")
    
    percent_numreads <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      ggplot(aes(x = NumReads, y = 100*percent_est_from_multimap)) +
      geom_point(size = 2) +
      ggtitle(paste0("Sample: ", j)) +
      labs(x = "Number of Reads",
           y = "Reads that Multi-Mapped (%)") +
      facet_grid(~ group) +
      theme_bw() +
      theme(
        axis.title = element_text(colour = "black", size = 14),
        axis.text = element_text(colour = "black", size = 12),
        plot.title = element_text(colour = "black", size = 14,
                                  face = "bold"),
        strip.background = element_rect(fill = "lightblue")
      )
    
    tryCatch({
    ggsave(plot = percent_numreads,
           filename = paste0(j, "_percent_numreads.png"),
           path = paste0(here("figures/multi_map/",
                              i,
                              "/percent_numreads/")),
           height = 100,
           width = 250,
           dpi = 400,
           units = "mm", create.dir = TRUE)
    }, error = function(e) {
      cat("No data found for iteration", i, ":", j)
    })
  }
}

Variance Percent

var_df <- read_csv(here("data/counts/variance_df.csv"))

for(k in group_name) {
  for(i in method_name) {
    for(j in samp_nms) {
      message("Starting Group ", k, ", Method ", i, ", Sample: ", j, " plot")
      
      var_percent <- multimap_group_list[[k]][[i]][[j]] %>%
        dplyr::mutate("transcript_id" = str_remove(Name, "\\..*")) %>%
        inner_join(var_df,
                   by = "transcript_id") %>%
        dplyr::arrange(desc(rowvar)) %>%
        ggplot(aes(x = rowvar, y = 100*percent_est_from_multimap)) +
        geom_point() +
        labs(x = "Transcript Count Variance",
             y = "Reads that Multi-Mapped (%)") +
        theme_bw()
      
      tryCatch({
        ggsave(plot = var_percent,
               filename = paste0(j, "_var_percent.png"),
               path = paste0(here("figures/multi_map/", k, "/var_percent/"),
                             tolower(i), "/"),
               height = 150,
               width = 175,
               dpi = 400,
               units = "mm", create.dir = TRUE)
      }, error = function(e) {
        cat("No data found for iteration", k, ":", i, "-", j)
      })
    }
  }
}

for(i in group_name) {
  for(j in samp_nms) {
    message("Starting Group ", i, ", Sample: ", j, " plot")
    
    var_percent <- multimap_group_list[[i]][["STAR"]][[j]] %>%
      dplyr::mutate(group = "STAR") %>%
      rbind(multimap_group_list[[i]][["Salmon"]][[j]] %>%
              dplyr::mutate(group = "Salmon")) %>%
      rbind(multimap_group_list[[i]][["HISAT2"]][[j]] %>%
              dplyr::mutate(group = "HISAT2")) %>%
      dplyr::mutate("transcript_id" = str_remove(Name, "\\..*")) %>%
      inner_join(var_df,
                 by = "transcript_id") %>%
      dplyr::arrange(desc(rowvar)) %>%
      ggplot(aes(x = rowvar, y = 100*percent_est_from_multimap)) +
      geom_point(size = 2) +
      facet_grid(~ group) +
      ggtitle(paste0("Sample: ", j)) +
      labs(x = "Transcript Count Variance",
           y = "Reads that Multi-Mapped (%)") +
      theme_bw() +
      theme(
        axis.title = element_text(colour = "black", size = 14),
        axis.text = element_text(colour = "black", size = 12),
        plot.title = element_text(colour = "black", size = 14,
                                  face = "bold"),
        strip.background = element_rect(fill = "lightblue")
      )
    
    tryCatch({
      ggsave(plot = var_percent,
             filename = paste0(j, "_var_percent.png"),
             path = paste0(here("figures/multi_map/", i, "/var_percent/")),
             height = 100,
             width = 205,
             dpi = 400,
             units = "mm", create.dir = TRUE)
    }, error = function(e) {
      cat("No data found for iteration", i, ":", j)
    })
  }
}
## No data found for iteration hisat2_unique : SRR13401116
## No data found for iteration hisat2_unique : SRR13401117
## No data found for iteration hisat2_unique : SRR13401118
## No data found for iteration hisat2_unique : SRR13401119
## No data found for iteration hisat2_unique : SRR13401120
## No data found for iteration hisat2_unique : SRR13401121
## No data found for iteration hisat2_unique : SRR13401122
## No data found for iteration hisat2_unique : SRR13401123
## No data found for iteration kallisto_unique : SRR13401116
## No data found for iteration kallisto_unique : SRR13401117
## No data found for iteration kallisto_unique : SRR13401118
## No data found for iteration kallisto_unique : SRR13401119
## No data found for iteration kallisto_unique : SRR13401120
## No data found for iteration kallisto_unique : SRR13401121
## No data found for iteration kallisto_unique : SRR13401122
## No data found for iteration kallisto_unique : SRR13401123
## No data found for iteration star_unique : SRR13401116
## No data found for iteration star_unique : SRR13401117
## No data found for iteration star_unique : SRR13401118
## No data found for iteration star_unique : SRR13401119
## No data found for iteration star_unique : SRR13401120
## No data found for iteration star_unique : SRR13401121
## No data found for iteration star_unique : SRR13401122
## No data found for iteration star_unique : SRR13401123
## No data found for iteration salmon_unique : SRR13401116
## No data found for iteration salmon_unique : SRR13401117
## No data found for iteration salmon_unique : SRR13401118
## No data found for iteration salmon_unique : SRR13401119
## No data found for iteration salmon_unique : SRR13401120
## No data found for iteration salmon_unique : SRR13401121
## No data found for iteration salmon_unique : SRR13401122
## No data found for iteration salmon_unique : SRR13401123

Number of transcripts

star_multitx <- star_multi_mapping[[1]] %>%
  dplyr::mutate(has_multi = case_when(AmbigCount > 0 ~ TRUE,
                                      AmbigCount == 0 ~ FALSE))
star_multitx$has_multi %>% table()
## .
##  FALSE   TRUE 
##   9253 153101
salmon_multitx <- sa_multi_mapping[[1]] %>%
  dplyr::mutate(has_multi = case_when(AmbigCount > 0 ~ TRUE,
                                      AmbigCount == 0 ~ FALSE))
salmon_multitx$has_multi %>% table()
## .
##  FALSE   TRUE 
##   5949 172697
hisat2_multitx <- hisat2_multi_mapping[[1]] %>%
  dplyr::mutate(has_multi = case_when(AmbigCount > 0 ~ TRUE, 
                                      AmbigCount == 0 ~ FALSE))
hisat2_multitx$has_multi %>% table()
## .
##  FALSE   TRUE 
##   7830 169044